In [ ]:
%matplotlib nbagg
import matplotlib.pyplot as plt
import numpy as np


In [ ]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
import numpy as np
np.set_printoptions(suppress=True)

digits = load_digits()
X, y = digits.data, digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y)

Removing the mean and scaling to unit variance


In [ ]:
from sklearn.preprocessing import StandardScaler

1) Instantiate the model


In [ ]:
scaler = StandardScaler()

2) Fit using only the data (no labels are needed).


In [ ]:
scaler.fit(X_train)

3) Transform the data (not predict).


In [ ]:
X_train_scaled = scaler.transform(X_train)

In [ ]:
X_train.shape

In [ ]:
X_train_scaled.shape

The transformed version of the data has the mean removed and (for non-constant features) unit variance:


In [ ]:
X_train_scaled.mean(axis=0)

In [ ]:
X_train_scaled.std(axis=0)
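
Some of the standard deviations above are 0 rather than 1. These are features that are constant in the training set (typically blank border pixels of the 8x8 images); StandardScaler keeps their scale at 1 so no division by zero occurs. A quick sanity check, as a minimal sketch that is not needed for the rest of the notebook:


In [ ]:
# indices of features that are constant in the training data
# (their standard deviation is exactly 0 before and after scaling)
np.where(X_train.std(axis=0) == 0)[0]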

In [ ]:
X_test_transformed = scaler.transform(X_test)
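
Note that the test set is transformed with the statistics learned on the training set, so its per-feature mean is only approximately zero. A small sketch to illustrate this, using the scaler's mean_ and scale_ attributes:


In [ ]:
# transform(X_test) uses the training mean and scale, not the test set's own statistics
print(np.allclose(X_test_transformed, (X_test - scaler.mean_) / scaler.scale_))
X_test_transformed.mean(axis=0)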

Principal Component Analysis

0) Import the model


In [ ]:
from sklearn.decomposition import PCA

1) Instantiate the model


In [ ]:
pca = PCA(n_components=2)

2) Fit to the data (here we fit on the full dataset, since we only use PCA for visualization)


In [ ]:
pca.fit(X)

3) Transform to lower-dimensional representation


In [ ]:
print(X.shape)
X_pca = pca.transform(X)
X_pca.shape
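
How much of the variance in the data do these two components capture? PCA exposes this via its explained_variance_ratio_ attribute; a quick check:


In [ ]:
# fraction of the total variance explained by each of the two components
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.sum())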

Visualize


In [ ]:
plt.figure()
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y)

In [ ]:
pca.components_.shape

In [ ]:
plt.matshow(pca.components_[0].reshape(8, 8), cmap="gray")
plt.colorbar()
plt.matshow(pca.components_[1].reshape(8, 8), cmap="gray")
plt.colorbar()
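
The 2D representation can also be mapped back to the original 64-dimensional pixel space with inverse_transform, which gives a feel for how much information the two components preserve. A sketch reconstructing the first digit in the dataset:


In [ ]:
# reconstruct images from the 2-component representation and compare to the original
X_reconstructed = pca.inverse_transform(X_pca)
fig, axes = plt.subplots(1, 2)
axes[0].imshow(X[0].reshape(8, 8), cmap="gray", interpolation="nearest")
axes[0].set_title("original")
axes[1].imshow(X_reconstructed[0].reshape(8, 8), cmap="gray", interpolation="nearest")
axes[1].set_title("PCA(2) reconstruction")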

Manifold Learning


In [ ]:
from sklearn.manifold import Isomap
isomap = Isomap()

In [ ]:
X_isomap = isomap.fit_transform(X)

In [ ]:
plt.figure()
plt.scatter(X_isomap[:, 0], X_isomap[:, 1], c=y)
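
Isomap was run with its defaults above (a 5-nearest-neighbor graph and a 2-dimensional embedding). The n_neighbors parameter controls how the neighborhood graph is built; a sketch of how you could experiment with it:


In [ ]:
# rebuild the embedding with a larger neighborhood graph to see how the layout changes
isomap_wide = Isomap(n_neighbors=30, n_components=2)
X_isomap_wide = isomap_wide.fit_transform(X)
plt.figure()
plt.scatter(X_isomap_wide[:, 0], X_isomap_wide[:, 1], c=y)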

Exercises

  • Visualize the digits dataset using the TSNE algorithm from the sklearn.manifold module (it runs for a couple of seconds).
  • Extract non-negative components from the digits dataset using NMF. Visualize the resulting components. The interface of NMF is identical to the PCA one. What qualitative difference can you find compared to PCA?

In [ ]:
# %load solutions/digits_unsupervised.py
from sklearn.manifold import TSNE
from sklearn.decomposition import NMF

# Compute TSNE embedding
tsne = TSNE()
X_tsne = tsne.fit_transform(X)

# Visualize TSNE results
plt.title("All classes")
plt.figure()
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y)

# build an NMF factorization of the digits dataset
nmf = NMF(n_components=16).fit(X)

# visualize the components
fig, axes = plt.subplots(4, 4)
for ax, component in zip(axes.ravel(), nmf.components_):
    ax.imshow(component.reshape(8, 8), cmap="gray", interpolation="nearest")
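
One way to see the qualitative difference asked about in the exercise is to plot the same number of PCA components side by side: the NMF components are non-negative and tend to look like additive parts of digits, while PCA components mix positive and negative values. A sketch for the comparison (pca16 is just a throwaway name used here):


In [ ]:
# for comparison: the first 16 PCA components of the same data
pca16 = PCA(n_components=16).fit(X)
fig, axes = plt.subplots(4, 4)
for ax, component in zip(axes.ravel(), pca16.components_):
    ax.imshow(component.reshape(8, 8), cmap="gray", interpolation="nearest")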
